Set coding environment
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
rm(list = ls()) # clean environment
cat("\014") # clean console
Load libraries (install.packages if not found)
library(datasets)
library(caret)
library(GGally)
library(viridis)
library(plotly)
library(cluster)
Import dataset and explore
dim(iris)
## [1] 150 5
sapply(iris, function(x) length(unique(x)))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 35 23 43 22 3
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
df = iris
Preprocessing
norm.values = preProcess(iris[,-5], method = c("center","scale"))
iris.norm = predict(norm.values,iris[,-5])
head(iris.norm)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 -0.8976739 1.01560199 -1.335752 -1.311052
## 2 -1.1392005 -0.13153881 -1.335752 -1.311052
## 3 -1.3807271 0.32731751 -1.392399 -1.311052
## 4 -1.5014904 0.09788935 -1.279104 -1.311052
## 5 -1.0184372 1.24503015 -1.335752 -1.311052
## 6 -0.5353840 1.93331463 -1.165809 -1.048667
Clustering
#best cluster based on WSSE
k = seq(1,15,1)
metrics.df <- data.frame(k, wsse = rep(0, length(k)))
for (i in 1:length(k)){
cluster_i = kmeans(iris.norm, centers = i, nstart = 25)
metrics.df[i, 2] = cluster_i$tot.withinss
}
plot_kmeans = ggplot(metrics.df, aes(x=k, y=wsse)) + geom_line(linetype = 3) + geom_point(shape=1) + ylab("Total Within Sum of Squares") + scale_x_continuous(breaks = k) + theme_classic()
plot_kmeans
set.seed(1)
k_clusters = kmeans(iris.norm, 3, nstart = 25)
k_clusters$size
## [1] 53 47 50
df$clusters = as.factor(k_clusters$cluster)
ggparcoord(df[,-5], columns = 1:4, groupColumn = 5, order = "anyClass", scale="globalminmax", showPoints = T, title = "Parallel Plot Iris dataset with k-clusters", alphaLines = 0.3) + scale_color_viridis(discrete = T, option = "E") + theme(plot.title = element_text(size=10)) + coord_flip() + theme_bw()
centroids = data.frame(k_clusters$centers)
scale_back_z_score <- function(z,var) {
sd_train = sd(df[,var])
mean_train = mean(df[,var])
x = sapply(z, function(x_) x_*sd_train + mean_train)
return(x)
}
centroids$Sepal.Length = scale_back_z_score(centroids$Sepal.Length, "Sepal.Length")
centroids$Sepal.Width = scale_back_z_score(centroids$Sepal.Width, "Sepal.Width")
centroids$Petal.Length = scale_back_z_score(centroids$Petal.Length, "Petal.Length")
centroids$Petal.Width = scale_back_z_score(centroids$Petal.Width, "Petal.Width")
p <- plot_ly(df[,-c(1,5)], x = ~Petal.Length, z = ~Petal.Width, y = ~Sepal.Width)
fig <- p %>% add_markers(color = ~clusters, size = 10) %>% add_mesh(opacity=0.1, data=df[df$clusters==2,], alphahull =0) %>% add_mesh(opacity=0.1, data=df[df$clusters==3,], alphahull = 0) %>% add_mesh(opacity=0.1, data=df[df$clusters==1,], alphahull = 0) %>% add_trace(data = centroids, type = "scatter3d", x = ~Petal.Length, z = ~Petal.Width, y = ~Sepal.Width, mode = "markers", marker = list(size = 5, color = "black", symbol="cross"), name="centroids")
fig
#check consistency
sil = silhouette(k_clusters$cluster, dist(df[,-c(5,6)]))
plot(sil, col=1:3, border=NA)